from openai import OpenAI
import time
from gxl_ai_utils.utils import utils_file
# Set OpenAI's API key and API base to use vLLM's API server.

# Command to start the server: vllm serve /home/A02_tmpdata3/ckpt/transformers/qwen3_8B --tensor-parallel-size 8

# Endpoint of the OpenAI-compatible server that requests are sent to.
openai_api_base = "http://10.21.4.2:8000/v1"
# Placeholder key; presumably the server does not check it -- confirm.
openai_api_key = "EMPTY"

# Shared client instance used by the helpers in this module.
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

def chat(question, think=False):
    """Send one user message to the served model and return the reply text.

    Args:
        question: Text sent as the content of a single ``user`` message.
        think: Value forwarded as ``enable_thinking`` in the request's
            ``chat_template_kwargs`` (presumably toggles the model's
            thinking mode -- confirm against the served chat template).

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    conversation = [{"role": "user", "content": question}]

    # Options sent via ``extra_body`` rather than as named arguments of
    # ``create``.
    server_options = {
        "top_k": 20,
        "chat_template_kwargs": {"enable_thinking": think},
    }

    completion = client.chat.completions.create(
        model="/home/A02_tmpdata3/ckpt/transformers/qwen3_8B",
        messages=conversation,
        max_tokens=8192,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        extra_body=server_options,
    )

    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content